Brain Stroke dataset - https://www.kaggle.com/datasets/jillanisofttech/brain-stroke-dataset/
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import dalex as dx
import xgboost as xgb
import shap
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Local copy of the Kaggle brain-stroke CSV (path is machine-specific).
path = r'C:\Users\Mateusz\Downloads\brain_stroke.csv'
df = pd.read_csv(path)
# Data preview
df
| gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
| 1 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
| 2 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
| 3 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
| 4 | Male | 81.0 | 0 | 0 | Yes | Private | Urban | 186.21 | 29.0 | formerly smoked | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4976 | Male | 41.0 | 0 | 0 | No | Private | Rural | 70.15 | 29.8 | formerly smoked | 0 |
| 4977 | Male | 40.0 | 0 | 0 | Yes | Private | Urban | 191.15 | 31.1 | smokes | 0 |
| 4978 | Female | 45.0 | 1 | 0 | Yes | Govt_job | Rural | 95.02 | 31.8 | smokes | 0 |
| 4979 | Male | 40.0 | 0 | 0 | Yes | Private | Rural | 83.94 | 30.0 | smokes | 0 |
| 4980 | Female | 80.0 | 1 | 0 | Yes | Private | Urban | 83.75 | 29.1 | never smoked | 0 |
4981 rows × 11 columns
# Null check: `any()` flags columns containing at least one missing value.
# The previous `all()` returned True only for columns that were *entirely*
# null, so partial missingness would have gone undetected. (For this
# dataset both report all-False, since there are no nulls at all.)
df.isnull().any()
gender False age False hypertension False heart_disease False ever_married False work_type False Residence_type False avg_glucose_level False bmi False smoking_status False stroke False dtype: bool
# Data type check
# All 11 columns are non-null; five are still plain `object` strings.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4981 entries, 0 to 4980 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 4981 non-null object 1 age 4981 non-null float64 2 hypertension 4981 non-null int64 3 heart_disease 4981 non-null int64 4 ever_married 4981 non-null object 5 work_type 4981 non-null object 6 Residence_type 4981 non-null object 7 avg_glucose_level 4981 non-null float64 8 bmi 4981 non-null float64 9 smoking_status 4981 non-null object 10 stroke 4981 non-null int64 dtypes: float64(3), int64(3), object(5) memory usage: 428.2+ KB
# Cast every object (string) column to the 'category' dtype, which
# XGBoost's `enable_categorical=True` requires (and which also shrinks
# memory usage, as the following info() output shows).
# Assign column-by-column: the previous `df.loc[:, mask] = ...` form
# writes values into the existing object-dtype columns and, in recent
# pandas versions, silently keeps the object dtype instead of category.
for col in df.select_dtypes(['object']).columns:
    df[col] = df[col].astype('category')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4981 entries, 0 to 4980 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 4981 non-null category 1 age 4981 non-null float64 2 hypertension 4981 non-null int64 3 heart_disease 4981 non-null int64 4 ever_married 4981 non-null category 5 work_type 4981 non-null category 6 Residence_type 4981 non-null category 7 avg_glucose_level 4981 non-null float64 8 bmi 4981 non-null float64 9 smoking_status 4981 non-null category 10 stroke 4981 non-null int64 dtypes: category(5), float64(3), int64(3) memory usage: 258.7 KB
# Target: the binary stroke indicator.
y = df['stroke']
# Features: everything except the target. The binary categoricals are
# one-hot encoded here only because the `max_cat_to_onehot` parameter
# in XGBoost is yet to be working properly..
X = pd.get_dummies(
    df.drop(columns='stroke'),
    columns=['gender', 'ever_married', 'Residence_type'],
    drop_first=True,
)
# Gradient-boosted classifier using XGBoost's native categorical support
# (enable_categorical=True requires tree_method="hist").
model_categorical = xgb.XGBClassifier(
    tree_method="hist",
    enable_categorical=True,
    eval_metric="logloss",
    max_depth=4,
    n_estimators=200,
    use_label_encoder=False,
    seed=77,
)
model_categorical.fit(X, y)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=True,
eval_metric='logloss', gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=4,
max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=200, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=77,
reg_alpha=0, reg_lambda=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=True,
eval_metric='logloss', gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=4,
max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=200, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=77,
reg_alpha=0, reg_lambda=1, ...)model_categorical.predict_proba(X.iloc[0:2])
array([[0.30083025, 0.69916975],
[0.47414333, 0.5258567 ]], dtype=float32)
# Alias passed into the dalex Explainer together with the custom
# predict function defined next.
model = model_categorical
def pf_xgboost_classifier_categorical(model, df):
    """dalex predict function: return P(class=1) for each row of `df`.

    Object (string) columns are cast to 'category' dtype, which the
    XGBoost model (fitted with enable_categorical=True) expects.
    The cast is done on a copy so the caller's DataFrame is not
    mutated as a side effect (the previous in-place `df.loc[...]`
    assignment altered the caller's frame and risked
    SettingWithCopyWarning).
    """
    df = df.copy()
    for col in df.select_dtypes(['object']).columns:
        df[col] = df[col].astype('category')
    return model.predict_proba(df)[:, 1]
# dalex wrapper around the fitted model; the custom predict function
# returns P(stroke=1) per row.
explainer = dx.Explainer(model, X, y,
                         predict_function=pf_xgboost_classifier_categorical,
                         label="GBM")
Preparation of a new explainer is initiated -> data : 4981 rows 10 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 4981 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : GBM -> predict function : <function pf_xgboost_classifier_categorical at 0x000001EA1C0AC1F0> will be used -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 1.07e-07, mean = 0.0498, max = 0.94 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.433, mean = -2.79e-05, max = 0.868 -> model_info : package xgboost A new explainer has been created!
# Performance metrics computed on the training data (no held-out split
# here, so the scores are optimistic).
explainer.model_performance()
| recall | precision | f1 | accuracy | auc | |
|---|---|---|---|---|---|
| GBM | 0.717742 | 1.0 | 0.835681 | 0.985947 | 0.999079 |
# Raw feature values of the first observation (a stroke case).
df.iloc[0]
gender Male age 67.0 hypertension 0 heart_disease 1 ever_married Yes work_type Private Residence_type Urban avg_glucose_level 228.69 bmi 36.6 smoking_status formerly smoked stroke 1 Name: 0, dtype: object
print("Pobability of the stroke for the first observation equals : ", explainer.predict(X.iloc[[0]]))
Probability of the stroke for the first observation equals :  [0.69916975]
# Raw feature values of the second observation (also a stroke case).
df.iloc[1]
gender Male age 80.0 hypertension 0 heart_disease 1 ever_married Yes work_type Private Residence_type Rural avg_glucose_level 105.92 bmi 32.5 smoking_status never smoked stroke 1 Name: 1, dtype: object
print("Pobability of the stroke for the second observation equals : ", explainer.predict(X.iloc[[1]]))
Probability of the stroke for the second observation equals :  [0.5258567]
Next, for the same observations, I have calculated the decomposition of predictions, so-called variable attributions, using SHAP
# SHAP variable attributions for the first two observations.
shap_attributions = [explainer.predict_parts(X.iloc[[i]], type="shap", label=f'patient {i}') for i in range(2)]
# Plot patient 0 with patient 1 overlaid for comparison.
shap_attributions[0].plot(shap_attributions[1::])
For the first patient, the probability of having a stroke is mostly affected by the heart_disease=1 and age=67 factors. The factors which have the least impact on the final probability prediction are residence_type_urban=1 and having no hypertension. The final probability of having a stroke for that patient equals 70%.
For the second patient, the probability of having a stroke is most affected by the age=80 factor. However, smoking_status = never smoked to some extent decreases the probability of having a stroke. The factors which have the least impact on the final probability prediction are having no hypertension and residence_type_urban=1. The final probability of having a stroke for that patient equals 53%.
# Break-down attributions for the same two observations.
bd_attributions = [explainer.predict_parts(X.iloc[[i]], type="break_down", label=f'patient {i}') for i in range(2)]
bd_attributions[0].plot(bd_attributions[1::])
Above break down plots for the observations 0 and 1 are shown.
Find any two observations in the dataset, such that they have different variables of the highest importance.
I have found that observations 1 and 2 have different variables of highest importance — for observation 1 the two most important variables are age and smoking status, whereas for observation 2 they are avg_glucose_level and private work type.
# SHAP attributions for observations 1 and 2, which have different
# top-importance variables.
shap_attributions12 = [explainer.predict_parts(X.iloc[[i]], type="shap", label=f'patient {i}') for i in range(1,3)]
shap_attributions12[0].plot(shap_attributions12[1::])
Observations 2 and 12 have different attribution values for the BMI index: observation 2 has a negative attribution, whereas observation 12 has a positive attribution for the BMI index.
# Feature values of observation 2 (for the BMI-attribution comparison).
df.iloc[2]
gender Female age 49.0 hypertension 0 heart_disease 0 ever_married Yes work_type Private Residence_type Urban avg_glucose_level 171.23 bmi 34.4 smoking_status smokes stroke 1 Name: 2, dtype: object
# Feature values of observation 12, the counterpart in the comparison.
df.iloc[12]
gender Female age 50.0 hypertension 1 heart_disease 0 ever_married Yes work_type Self-employed Residence_type Rural avg_glucose_level 167.41 bmi 30.9 smoking_status never smoked stroke 1 Name: 12, dtype: object
# SHAP attributions for observations 2 and 12, plotted together.
shap_attributions2_12 = [explainer.predict_parts(X.iloc[[i]], type="shap", label=f'patient {i}') for i in [2,12]]
shap_attributions2_12[0].plot(shap_attributions2_12[1::])
# One-hot encode the remaining multi-level categoricals so the second
# model variant works on a purely numeric matrix.
X_ohe = pd.get_dummies(
    X,
    columns=['work_type', 'smoking_status'],
    drop_first=True,
)
X_ohe
| age | hypertension | heart_disease | avg_glucose_level | bmi | gender_Male | ever_married_Yes | Residence_type_Urban | work_type_Private | work_type_Self-employed | work_type_children | smoking_status_formerly smoked | smoking_status_never smoked | smoking_status_smokes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 67.0 | 0 | 1 | 228.69 | 36.6 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| 1 | 80.0 | 0 | 1 | 105.92 | 32.5 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 49.0 | 0 | 0 | 171.23 | 34.4 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
| 3 | 79.0 | 1 | 0 | 174.12 | 24.0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 4 | 81.0 | 0 | 0 | 186.21 | 29.0 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4976 | 41.0 | 0 | 0 | 70.15 | 29.8 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 4977 | 40.0 | 0 | 0 | 191.15 | 31.1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
| 4978 | 45.0 | 1 | 0 | 95.02 | 31.8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4979 | 40.0 | 0 | 0 | 83.94 | 30.0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 4980 | 80.0 | 1 | 0 | 83.75 | 29.1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
4981 rows × 14 columns
# Baseline XGBoost trained on the fully one-hot-encoded feature matrix
# (no native categorical handling needed here).
model_ohe = xgb.XGBClassifier(
    eval_metric="logloss",
    max_depth=4,
    n_estimators=200,
    use_label_encoder=False,
)
model_ohe.fit(X_ohe, y)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric='logloss', gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=4,
max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=200, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric='logloss', gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=4,
max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=200, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, ...)shap_explainer = shap.explainers.Tree(model_ohe, data=X_ohe, model_output="probability")
# Compute SHAP values (in probability units, per the explainer's
# model_output setting) for every row of X_ohe.
shap_values = shap_explainer(X_ohe)
98%|===================| 4885/4981 [00:16<00:00]
# Inspect the Explanation object (.values / .base_values / .data).
shap_values
.values =
array([[ 2.42216580e-01, 6.51509072e-03, 1.24446426e-01, ...,
-5.27332844e-03, 1.86445604e-04, -2.34876625e-03],
[ 3.01769011e-01, 3.45309145e-03, -7.76061660e-03, ...,
-8.95937774e-03, -4.81551304e-03, 1.55096793e-03],
[ 5.28487591e-02, 3.29651151e-04, -2.98279288e-03, ...,
-1.41098869e-03, 8.58337259e-03, 2.86821600e-02],
...,
[-2.43981254e-02, 6.07724280e-03, 1.24344725e-04, ...,
-1.34687440e-03, 4.12897056e-04, 2.29359779e-03],
[-4.30838351e-02, -2.39149124e-05, 3.54136606e-04, ...,
-1.56279770e-03, 1.16184691e-03, 3.64034616e-04],
[ 6.04202168e-02, 9.29519471e-03, 1.31577934e-03, ...,
-4.91131149e-06, -3.41092317e-03, 6.98445007e-04]])
.base_values =
array([0.04030465, 0.04030465, 0.04030465, ..., 0.04030465, 0.04030465,
0.04030465])
.data =
array([[67., 0., 1., ..., 1., 0., 0.],
[80., 0., 1., ..., 0., 1., 0.],
[49., 0., 0., ..., 0., 0., 1.],
...,
[45., 1., 0., ..., 0., 0., 1.],
[40., 0., 0., ..., 0., 0., 1.],
[80., 1., 0., ..., 0., 1., 0.]])
# Per-patient SHAP waterfall plots for the first two observations.
for i in range(2):
    shap.plots.waterfall(shap_values[i])
# Global view: beeswarm of the ten most influential features.
shap.plots.beeswarm(shap_values, max_display=10, plot_size=(9, 6))
# Mean |SHAP| bar chart of global feature importance.
shap.plots.bar(shap_values, max_display=10, show=False)
plt.gcf().set_size_inches(9, 6)
plt.show()
# Dependence plot: SHAP value of age against age itself.
shap.plots.scatter(shap_values[:, "age"], show=False)
plt.gcf().set_size_inches(9, 6)
plt.show()
# Age dependence coloured by gender, to look for interactions.
shap.plots.scatter(shap_values[:, "age"], color=shap_values[:, "gender_Male"], show=False)
plt.gcf().set_size_inches(11, 6)
plt.show()
# Age dependence coloured by BMI.
shap.plots.scatter(shap_values[:, "age"], color=shap_values[:, "bmi"], show=False)
plt.gcf().set_size_inches(11, 6)
plt.show()
# Age dependence coloured by average glucose level.
shap.plots.scatter(shap_values[:, "age"], color=shap_values[:, "avg_glucose_level"], show=False)
plt.gcf().set_size_inches(11, 6)
plt.show()
from sklearn.svm import SVC
# Kernel SVM with probability estimates enabled (needed so SHAP/dalex
# can explain predicted probabilities). Fitting on `.values` drops the
# feature names — presumably to avoid feature-name warnings when numpy
# arrays are passed at predict time; TODO confirm.
svm_ohe = SVC(probability=True)
svm_ohe.fit(X_ohe.values, y)
SVC(probability=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(probability=True)
# Use a 111-row sample as background data — KernelSHAP cost grows with
# background size (see the warnings emitted below).
X_subset = X_ohe.sample(111, random_state=0)
exp_svm = dx.Explainer(svm_ohe, X_subset, label="SVM", verbose=False)
exp_xgboost = dx.Explainer(model_ohe, X_subset, label="GBM", verbose=False)
# Model-agnostic SHAP values for the first two patients (SVM model).
sv_svm = [exp_svm.predict_parts(X_ohe.iloc[[i]], type="shap_wrapper") for i in range(2)]
Using 111 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
Using 111 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
# Model-agnostic SHAP values for the same two patients (XGBoost model).
sv_xgboost = [exp_xgboost.predict_parts(X_ohe.iloc[[i]], type="shap_wrapper") for i in range(2)]
Using 111 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
Using 111 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
# Show XGBoost vs. SVM SHAP explanations side by side, patient by patient.
for i in range(2):
    print(f'==== Patient {i} ====')
    print('---- XGBoost:')
    sv_xgboost[i].plot()
    print('---- SVM:')
    sv_svm[i].plot()
==== Patient 0 ==== ---- XGBoost:
---- SVM:
==== Patient 1 ==== ---- XGBoost:
---- SVM:
Here we can see huge differences between the XGBoost and SVM models in creating the explanations for first two observations.
# Load shap's JS bundle so the interactive force plot renders in the notebook.
shap.initjs()
# Interactive stacked force plot over the first 500 observations.
shap.plots.force(shap_values[:500])
The plot above is interactive - feel free to play with it!